Dataviz in R

Brown bag seminar, June 18, 2015

Markus Kainu
ESS, Team F

Creative Commons Licence

Content

  1. A biased intro of (statistical) graphics
  2. Why to use R for graphics
  3. Case 1: Static graphics for print & web
  4. Case 2: interactive graphics & applications for the web

Graphics glossary

Static graphics

bitmap vector
file extensions .jpg, .png, .gif .eps, .pdf, .svg, .ai
common example digital photo google maps
consists of million of pixels points, lines and polygons
file size large small
software Gimp (Photoshop) Inkscape (Illustrator)
good for web, printing (in high-res) print, post-processing
exclusively used digital photography maps and graphs

You can make vector graphics into bitmap graphics, but not the other way around

Interactive graphics

interactive graphics interactive web applications
typical example one-off javascript plot Trade data explorer
look and feel amazing but limited amazing and unlimited
depends online libraries, shipped with data Needs a full R instance
R htmlwidgets shiny
good for showing off & showing off & explorative wiz

Why to R and why to script your plots?

Case 1: Static graphics for print & web

Population data from FAOSTAT

library(FAOSTAT)
library(knitr)
library(dplyr)
library(ggplot2)
dat <- getFAOtoSYB(domainCode = "OA", elementCode = 551,itemCode = 3010) # Download rural population
dat1 <- dat$entity
dat <- getFAOtoSYB(domainCode = "OA", elementCode = 561,itemCode = 3010) # Download urban population
dat2 <- dat$entity
dat <- merge(dat1,dat2,by=c("FAOST_CODE","Year"))
dim(dat)
[1] 19854     4
kable(head(dat)) # print 5 first rows
FAOST_CODE Year OA_3010_551 OA_3010_561
100 1961 375928 82699
100 1962 382709 85253
100 1963 389708 87908
100 1964 396938 90670
100 1965 404412 93541
100 1966 412128 96528

Population data from FAOSTAT

names(dat) <- c("FAOST_CODE","Year","rural_pop","urban_pop")
dat$total_pop <- dat$rural_pop + dat$urban_pop # Compute total population
dat$rural_share <- round(dat$rural_pop / dat$total_pop * 100,1) # Compute rural population share of total population
library(countrycode) # Add the country name & continent using countrycode-package
dat$FAOST_CODE[dat$FAOST_CODE == 41] <- 351
dat$cntry <- countrycode(dat$FAOST_CODE, origin = "fao", destination = "country.name")
dat$cont <- countrycode(dat$FAOST_CODE, origin = "fao", destination = "continent")
dat <- na.omit(dat)
dim(dat)
[1] 16475     8
kable(head(dat)) # print 5 first rows
FAOST_CODE Year rural_pop urban_pop total_pop rural_share cntry cont
100 1961 375928 82699 458627 82.0 India Asia
100 1962 382709 85253 467962 81.8 India Asia
100 1963 389708 87908 477616 81.6 India Asia
100 1964 396938 90670 487608 81.4 India Asia
100 1965 404412 93541 497953 81.2 India Asia
100 1966 412128 96528 508656 81.0 India Asia
library(ggplot2)
library(dplyr)
p <- ggplot(data=dat, 
            aes(x=rural_pop,y=urban_pop))
p + geom_point()

plot of chunk unnamed-chunk-5

pie_data <- dat %>% filter(Year == 2015) %>% 
  group_by(cont) %>% summarise(share = sum(rural_pop))
pie_data <- pie_data %>% mutate(sum = sum(share)) 
p <- ggplot(pie_data, aes(x=sum/2, y = share, fill =cont, width = sum))
p <- p + geom_bar(position="fill", stat="identity") 
p + coord_polar("y")

plot of chunk unnamed-chunk-6

plot_data <- dat[dat$Year == 2015,]
p <- ggplot(data=plot_data, 
            aes(x=cntry,y=rural_share))
p + geom_bar(stat="identity")

plot of chunk unnamed-chunk-7

p <- ggplot(data=dat, 
            aes(x=Year,y=rural_share,group=cntry))
p + geom_point() + geom_line()

plot of chunk unnamed-chunk-8

Share of rural population

plot_data <- dat %>% filter(Year == 2015)
p <- ggplot(data=plot_data, 
            aes(x=cntry,y=rural_share))
p + geom_bar(stat="identity")

plot of chunk unnamed-chunk-9

Share of rural population

plot_data$cntry <- factor(plot_data$cntry, levels=arrange(plot_data, -rural_share)$cntry)
p <- ggplot(data=plot_data, 
            aes(x=cntry,y=rural_share))
p + geom_bar(stat="identity")

plot of chunk unnamed-chunk-10

Share of rural population

plot_data <- arrange(plot_data, -rural_share)[1:20,] # top 20
p <- ggplot(data=plot_data, aes(x=cntry,y=rural_share,fill=cont))
p + geom_bar(stat="identity")

plot of chunk unnamed-chunk-11

Share of rural population

p <- ggplot(data=plot_data, aes(x=cntry,y=rural_share,fill=cont))
p <- p + geom_bar(stat="identity")
p + coord_flip()

plot of chunk unnamed-chunk-12

p <- ggplot(data=plot_data, aes(x=cntry,y=rural_share,fill=cont))
p <- p + geom_bar(stat="identity")
p <- p + coord_flip()
p <- p + scale_fill_manual(values=c("#1b9e77","#d95f02","#7570b3","#e7298a","#66a61e"))
p <- p + theme_minimal()
p <- p + theme(text = element_text(family = "Open Sans"),
               title = element_text(size = 20),
               axis.title = element_text(size=14),
               legend.position = "top")
p <- p + labs(title="Share of rural population", x=NULL,y="Share of rural population (%)")
p <- p + guides(fill = guide_legend(title = "Continent", title.position = "left", title.hjust=.5))
p

plot of chunk unnamed-chunk-13

Static Choropleth maps

library(gisfao)
library(scales)
library(tidyr)
library(stringr)
shape <- fao_world
FAOST_CODE <- as.character(shape$FAOST_CODE)
df.d <- data.frame(FAOST_CODE, VarX = rep(NA, nrow(shape)))
dat$FAOST_CODE[dat$FAOST_CODE == 351] <- 41
mapdata <- merge(dat %>% filter(Year %in% c(1965,1980,2000,2015)), df.d, by.x = "FAOST_CODE", all.y = TRUE)
mapdata <- mapdata[c("FAOST_CODE","Year","rural_share")]
mapdata$Year <- paste0("X",mapdata$Year)
mapdata <- spread(mapdata, Year, rural_share)
mapdata$XNA <- NULL
row.names(mapdata) <- mapdata$FAOST_CODE
row.names(shape) <- as.character(shape$FAO_CODE)
mapdata <- mapdata[order(row.names(mapdata)), ]
shape <- shape[order(row.names(shape)), ]
shape2 <- maptools::spCbind(shape, mapdata)
shape2$id <- rownames(shape2@data)
map.points <- fortify(shape2, region = "id")
map.df <- merge(map.points, shape2, by = "id")

map.df <- gather(map.df, "Year", "rural_share", 28:31)
map.df$Year <- str_replace_all(map.df$Year, "X", "Year ")
map <- ggplot(data=map.df, aes(long,lat,group=group))
map <- map  + geom_polygon(aes(fill = rural_share),colour=alpha("white", 3/4),size=.2)
map <- map + theme(legend.position = "top", 
                          legend.background=element_rect(colour=NA, fill=NA),
                          axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank())
map <- map + facet_wrap(~Year, ncol = 2)

Static Choropleth maps

map

plot of chunk unnamed-chunk-15

Animated choropleth maps

library(gisfao)
library(scales)
library(tidyr)
library(stringr)
library(RColorBrewer)
library(gridExtra)
shape <- fao_world
FAOST_CODE <- as.character(shape$FAOST_CODE)
df.d <- data.frame(FAOST_CODE, VarX = rep(NA, nrow(shape)))
dat$FAOST_CODE[dat$FAOST_CODE == 351] <- 41
mapdata <- merge(dat, df.d, by.x = "FAOST_CODE", all.y = TRUE)
mapdata <- mapdata[c("FAOST_CODE","Year","rural_share")]
mapdata$Year <- paste0("Year ",mapdata$Year)
mapdata <- spread(mapdata, Year, rural_share)
mapdata$XNA <- NULL
row.names(mapdata) <- mapdata$FAOST_CODE
row.names(shape) <- as.character(shape$FAO_CODE)
mapdata <- mapdata[order(row.names(mapdata)), ]
shape <- shape[order(row.names(shape)), ]
shape2 <- maptools::spCbind(shape, mapdata)
shape2$id <- rownames(shape2@data)
map.points <- fortify(shape2, region = "id")
map.df <- merge(map.points, shape2, by = "id")
map.df <- gather(map.df, "Year", "rural_share", 28:118)
map.df$Year <- as.character(map.df$Year)
map.df$Year <- str_replace(map.df$Year, "Year.", "")
map.df$Year <- factor(map.df$Year)
map.df$Year <- as.numeric(levels(map.df$Year))[map.df$Year]
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
y_dat <- data.frame()



for(y in unique(map.df$Year)[!is.na(unique(map.df$Year))]) {

  map.df_subset <- map.df[map.df$Year == y,]
  map <- ggplot(data=map.df_subset, aes(long,lat,group=group))
  map <- map  + geom_polygon(aes(fill = rural_share),colour=alpha("white", 3/4),size=.2)
  map <- map + theme(legend.position = "top", 
                          legend.background=element_rect(colour=NA, fill=NA),
                          axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank())
  map <- map + labs(title=paste("Year",y))
  map <- map + scale_fill_gradientn(limits = c(0,100),colours= myPalette(100))
  # plot
  line_data <- map.df_subset[!duplicated(map.df_subset[c("FAO_CODE")]),]
  line_data <- line_data[c("FAO_CODE","rural_share","ADM0_NAME","Year")]
  y_dat <- rbind(y_dat,line_data)
  plot <- ggplot(y_dat, aes(x=Year,y=rural_share,group=FAO_CODE,color=rural_share)) +
                   geom_line() +
                   coord_cartesian(xlim=c(1960,2050)) +
    scale_color_gradientn(limits = c(0,100),colours= myPalette(100)) +
    coord_cartesian(ylim=c(0,100))


  png(file=paste0("map/Year.",y,".png"), width=1000, height=700, res=120)
  grid.arrange(arrangeGrob(map,plot, widths=c(3/5,2/5), ncol=2))
  dev.off()
}
system("convert -delay 40 -loop 0 map/*.png map/maps.gif")

Exporting for post-processing

# barplot
ggsave(p, file="output/barplot.svg", width=14, height=8)
ggsave(p, file="output/barplot.pdf", width=14, height=8)
ggsave(p, file="output/barplot.png")

ggsave(map, file="output/map.svg", width=14, height=8)
ggsave(map, file="output/map.pdf", width=14, height=8)
ggsave(map, file="output/map.png")

-> post-processing in Inkscape & Gimp

Case 2: interactive graphics & applications for the web

Selected Javascript with R bindings

  • Htmlwidgets - THE R interface to the multiple javascript charting libraries
  • rCharts - Ánother, partly overlapping R interface to multiple javascript charting libraries
  • googleVis - use Google Chart Tools from R
  • ggvis - interactive plots from the makers of ggplot2
  • plotly - convert ggplot2 figures to interactive plots easily

Interactive maps with Leaflet library

library(gisfao)
library(dplyr)
library(sp)
library(leaflet)

# data manipulation
shape <- fao_world
FAOST_CODE <- as.character(shape$FAOST_CODE)
VarX <- rep(NA, nrow(shape))
df.d <- data.frame(FAOST_CODE, VarX)
dat2 <- merge(dat, df.d, by.x = "FAOST_CODE", all.y = TRUE)
row.names(dat2) <- dat2$FAOST_CODE
row.names(shape) <- as.character(shape$FAO_CODE)
dat2 <- dat2[order(row.names(dat2)), ]
shape <- shape[order(row.names(shape)), ]
shape2 <- maptools::spCbind(shape, dat2)

Interactive maps with Leaflet library

# Define the region and color variables and title:
main <- "Rural population"
color = "rural_share"

# Define color palette
palette <- leaflet::colorNumeric(c("#fef0d9","#fdcc8a","#fc8d59","#e34a33","#b30000"), 
                                 domain = shape2$rural_share)

# Define the text for popup box
state_popup <- paste0("<strong>Country: </strong>", shape2$ADM0_NAME, 
                      "<br><strong>", main, ": </strong>", 
                      round(shape2@data[,c(color)], digits=2))

# Generate interactive visualization
leaflet(data = shape2) %>% 
  addTiles() %>% 
  setView(lng = 12.30, lat = 41.54, zoom = 3) %>% 
  addPolygons(fillColor = ~palette(get(color)), 
              fillOpacity = 0.7, 
              color = "#000000", 
              weight = 1,
              popup = state_popup) %>% 
  addLegend("bottomright", pal = palette, values = ~rural_share,
    title = "Share of rural population",
    labFormat = labelFormat(prefix = "%")
  )

Interactive maps with Leaflet library

Conclusions

R and datavisualisations

Pros

  • R has good graphics capabilities already and development is fast
  • Scripting makes your ideas/graphs re-usable
  • organising your code is important
  • growing expertise in ESS/FAO

Cons

  • scripting is slow the first time

Graphics

Static graphics

graphics format pros cons
bitmap graphics reliable limited editing
some-compatible large file sizes
M$ compatible
vector graphics easy to post-process non M$ compatible (excluding .wmf)
small file size
great for print

Interactive graphics

graphics format pros cons
interactive plots development currently fast short life span
lots of different options field unorganised
JS skills still very useful
require internet
can't print!
web applications very flexible hosting & maintenance
any analysis/task R or linux can do requires tailoring

Questions to James?